Preparations

Load libraries and functions

library("cluster")
library("dendextend")
## 
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
source("functions.R")
## Loading required package: ggplot2

Corpus description and selection

Load data

# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "../dh-meier-data/output/transkribus-etudiants/tokenized/boudams", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_student_expanded_words.csv")
data = read.csv("data/transkr_student_expanded_words.csv", header = TRUE, row.names = 1)
data = t(data)

Text lengths

nwords = colSums(data)
summary(nwords)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       7    2239    3426    5021    6754   18989
boxplot(nwords)
boxplot(nwords)$out

## 05_Ano_Leg-A_Ap_NA_Vie_Jacques  29_Wau_Leg-C_Co_Ev_Vie_Martin 
##                          18048                          14458 
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial 
##                          18989                          15306
head(sort(nwords), n = 15)
## Item_Aut_Leg-Légendier_Cl_so__TitreRaccourci 
##                                            7 
##                  03_Ano_Leg-A_Ap_NA_Mar_Jean 
##                                          294 
##                  62_Ano_Leg-N_NA_NA_NA_Index 
##                                          325 
##               61_Ano_Leg-B_NA_NA_NA_Jugement 
##                                          383 
##               30_Wau_Leg-C_Co_Ev_Tra_Martin2 
##                                          726 
##              08_Ano_Leg-A_Ap_NA_Vie_Philippe 
##                                         1021 
##             59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 
##                                         1279 
##         09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 
##                                         1372 
##                 32_Wau_Leg-C_Co_Ev_Vie_Brice 
##                                         1385 
##            60_Ano_Leg-B_NA_NA_NA_Antechriste 
##                                         1501 
##               54_Ano_Leg-C_Vi_NA_Vie_Pelagie 
##                                         1514 
##              20_Ano_Leg-B_Ma_Fe_Vie_Felicite 
##                                         1698 
##                  11_Ano_Leg-A_Ap_NA_Vie_Marc 
##                                         1825 
##                 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 
##                                         1899 
##            53_Ano_Leg-C_Vi_NA_Vie_Marguerite 
##                                         1946
toKeep = colnames(data)[nwords > 1000]

toKeep = toKeep[grep("Bestiaire", toKeep, invert = TRUE)]

df = as.data.frame(nwords)

ggplot(df, aes(x="", y=nwords)) + geom_violin() + geom_boxplot(width=0.3) +  theme(axis.text.y = element_text(size = rel(1.4)), axis.title = element_text(size = rel(1.4))) + xlab("Est. length in words of corpus texts") + scale_y_continuous(breaks=c(0, 2500, 5000, 7500, 10000, 12500, 15000, 17500))

Transkribus raw data

3-grams from raw data

Load data

# Get data with Stylo
#data = stylo::load.corpus.and.parse(corpus.dir = "../dh-meier-data/output/transkribus-etudiants/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_student_raw_char3grams.csv")
data = read.csv("data/transkr_student_raw_char3grams.csv", header = TRUE, row.names = 1)
data = t(data)
data = data[, toKeep]
data = data[rowSums(data) > 0, ]

Burrows + vector-length norm

d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
# save data for robustness checks
Raw3grSave = d
d = d[select,]
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHRaw3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams = cahPlotCol(myCAH, k = 9, main = "Characters 3-grams from raw data (Transkr)")

somCAH = somCluster(d)
somplotRaw3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Characters 3-grams from raw data (Transkr)")

Class descriptions, Wauchier and StLambert

Classes members

classes = cutree(myCAH, k = 5)
classes
##         00_Ano_Leg-A_Ap_Ev_Dis_Pierre1         01_Ano_Leg-A_Ap_NA_Vie_Pierre2 
##                                      1                                      1 
##            02_Ano_Leg-A_Ap_NA_Pas_Paul         04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev 
##                                      1                                      1 
##         05_Ano_Leg-A_Ap_NA_Vie_Jacques        06_Ano_Leg-A_Ap_NA_Vie_Matthieu 
##                                      1                                      2 
##       07_Ano_Leg-A_Ap_NA_Vie_SimonJude        08_Ano_Leg-A_Ap_NA_Vie_Philippe 
##                                      2                                      2 
##   09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur      10_Ano_Leg-A_Ap_NA_Vie_Barthelemy 
##                                      2                                      2 
##            11_Ano_Leg-A_Ap_NA_Vie_Marc          12_Ano_Leg-A_Ma_Ho_Vie_Longin 
##                                      2                                      2 
##       13_Ano_Leg-B_Ma_Ho_Vie_Sebastien         14_Ano_Leg-B_Ma_Ho_Vie_Vincent 
##                                      2                                      2 
##         15_Ano_Leg-B_Ma_Ho_Vie_Georges      16_Ano_Leg-B_Ma_Ho_Vie_Christophe 
##                                      2                                      2 
##          17_Ano_Leg-B_Ma_Fe_Vie_Agathe            18_Ano_Leg-B_Ma_Fe_Vie_Luce 
##                                      2                                      2 
##           19_Ano_Leg-B_Ma_Fe_Vie_Agnes        20_Ano_Leg-B_Ma_Fe_Vie_Felicite 
##                                      2                                      2 
##       21_Ano_Leg-B_Ma_Fe_Vie_Christine          22_Ano_Leg-B_Ma_Fe_Vie_Cecile 
##                                      2                                      2 
##           23_Ano_Leg-B_Ma_Ho_Vie_Sixte         24_Ano_Leg-B_Ma_Ho_Vie_Laurent 
##                                      2                                      2 
##       25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte         26_Ano_Leg-B_Ma_Ev_Vie_Lambert 
##                                      2                                      3 
##       27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon         28_Ano_Leg-B_Ma_Ho_Vie_Clement 
##                                      4                                      5 
##          29_Wau_Leg-C_Co_Ev_Vie_Martin         31_Wau_Leg-C_Co_Ev_Dia_Martin3 
##                                      3                                      3 
##           32_Wau_Leg-C_Co_Ev_Vie_Brice          33_Wau_Leg-C_Co_Er_Vie_Gilles 
##                                      3                                      3 
##         34_Wau_Leg-C_Co_Ev_Vie_Martial         35_Wau_Leg-C_Co_Ev_Vie_Nicolas 
##                                      3                                      3 
##        36_Wau_Leg-C_Co_Ev_Mir_Nicolas2        37_Wau_Leg-C_Co_Ev_Tra_Nicolas3 
##                                      3                                      3 
##          38_Wau_Leg-C_Co_Ev_Vie_Jerome          39_Wau_Leg-C_Co_Ev_Vie_Benoit 
##                                      3                                      3 
##          40_Wau_Leg-C_Co_Er_Vie_Alexis           41_Ano_Leg-C_Vi_NA_Vie_Irene 
##                                      3                                      5 
##       42_Ano_Leg-B_Vi_NA_Ass_NotreDame       43_Ano_Leg-C_Vi_NA_Vie_Catherine 
##                                      4                                      4 
##           44_Ano_Leg-C_Ap_NA_Vie_Andre          45_Ano_Leg-C_Ap_NA_Pas_Andre2 
##                                      4                                      4 
##         46_Ano_Leg-B_Co_NA_Pur_Patrice      47_Ano_Leg-C_Co_er_Vie_PaulErmite 
##                                      5                                      5 
##         48_Ano_Leg-C_Co_ev_Tra_Benoit2            49_Ano_Leg-C_NA_NA_Vie_Maur 
##                                      5                                      5 
##         50_Ano_Leg-C_NA_NA_Vie_Placide        51_Ano_Leg-C_Ma_ho_Vie_Eustache 
##                                      5                                      5 
##           52_Ano_Leg-C_Co_NA_Vie_Fursi      53_Ano_Leg-C_Vi_NA_Vie_Marguerite 
##                                      5                                      5 
##         54_Ano_Leg-C_Vi_NA_Vie_Pelagie          55_Ano_Leg-C_Co_NA_Vie_Simeon 
##                                      5                                      5 
##        56_Ano_Leg-C_Co_NA_Vie_Mamertin          57_Ano_Leg-C_Vi_NA_Vie_Julien 
##                                      5                                      5 
## 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne       59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 
##                                      5                                      5 
##      60_Ano_Leg-B_NA_NA_NA_Antechriste 
##                                      4

Classes description

Most correlated features to the classification in general
maDesc = classesDesc(myCAH, d, k=5)
head(maDesc$quanti.var, n = 20)
##             Eta2      P-value
## m.m.e  0.9429217 7.035425e-33
## X..a.. 0.9202371 5.767917e-29
## o.m.m  0.8619993 1.451909e-22
## d.e..  0.7980146 3.951109e-18
## e.i.g  0.7891341 1.249124e-17
## q.i.l  0.7694745 1.353282e-16
## X..f.a 0.7634240 2.703410e-16
## q.u.e  0.7396851 3.468200e-15
## f.a.i  0.7348908 5.641721e-15
## s.i..  0.7347140 5.742831e-15
## l.s..  0.7336217 6.407583e-15
## a.i.n  0.7153801 3.740960e-14
## X..u.n 0.7019548 1.275323e-13
## X..i.h 0.6976194 1.872420e-13
## m.m.a  0.6958163 2.193063e-13
## e.l.a  0.6929311 2.818609e-13
## h.u..  0.6907259 3.409028e-13
## X..m.l 0.6893412 3.838705e-13
## e.u.p  0.6809368 7.800879e-13
## a.l.a  0.6651206 2.816989e-12
A = myDescPlot(relativeFreqs(data)["m.m.e", , drop = FALSE], classes, type = "violinplot", main = "m.m.e", ylab = "Relative frequency", xlab = "", classlabels = classlabels)
B = myDescPlot(relativeFreqs(data)["X..a..", , drop = FALSE], classes, type = "violinplot", main = "X..a..", ylab = "Relative frequency", xlab = "", classlabels = classlabels)
C = myDescPlot(relativeFreqs(data)["o.m.m", , drop = FALSE], classes, type = "violinplot", main = "o.m.m", ylab = "Relative frequency", xlab = "", classlabels = classlabels)
D = myDescPlot(relativeFreqs(data)["d.e..", , drop = FALSE], classes, type = "violinplot", main = "d.e..", ylab = "Relative frequency", xlab = "", classlabels = classlabels)
E = myDescPlot(relativeFreqs(data)["e.i.g", , drop = FALSE], classes, type = "violinplot", main = "e.i.g", ylab = "Relative frequency", xlab = "", classlabels = classlabels)
F = myDescPlot(relativeFreqs(data)["q.i.l", , drop = FALSE], classes, type = "violinplot", main = "q.i.l", ylab = "Relative frequency", xlab = "", classlabels = classlabels)


gridExtra::grid.arrange(A,B,C,D,E,F, ncol = 2)

Features and clusters (v-test, distribution,…): Wauchier
nfeats = 10
values = c(head(sort(maDesc$quanti$`3`[,1], decreasing = TRUE), n = nfeats), head(sort(maDesc$quanti$`3`[,1]), n = nfeats))
classBarplot(values, title="V-test for Wauchier class", ylab = "v-test")

Example of two main feats of Wauchier class

class = as.factor(classes)
levels(class) = classlabels
levels(class) = c(levels(class), "LAMB")
class["26_Ano_Leg-B_Ma_Ev_Vie_Lambert"] = "LAMB"
rf = cbind(as.data.frame(t(relativeFreqs(data))), class)
qplot(q.i.l, o.m.., colour=class, data = rf)

Specificities
specifPlot(data, myCAH, k = 5)

Transkribus expanded data

Load data

data = read.csv("data/transkr_student_expanded_words.csv", header = TRUE, row.names = 1)
data = t(data)
data = data[, toKeep]
data = data[rowSums(data) > 0, ]

Forms from expanded data

Burrows + vector-length norm

d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
# save data for robustness checks
WordsSave = d
d = d[select,]
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHForms = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotForms = cahPlotCol(myCAH, k = 9, main = "Expanded word forms (Transkr/Boudams/Pie)")

somCAH = somCluster(d)
somplotForms = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)")

Affixes from expanded data

# Creating affixes database from all words
dataAffs = countAffixes(data)

Burrows + vector-length norm

d = dataAffs
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
d = d[select,]
AffixesSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHAffs = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes = cahPlotCol(myCAH, k = 9, main = "Expanded affixes (Transkr/Boudams/Pie)")
somCAH = somCluster(d)
somplotAffixes = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)")

Unstandardised function words from expanded data

Create function words list

#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# Avec ou sans pronoms ?
functionWords = source("functionWords.R")$value

Burrows + vector-length norm

d = relativeFreqs(data)
d = d[functionWords,]
# save data for robustness checks
FWSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHFW = myCAH
# barplot(sort(myCAH$height))
plotFW = cahPlotCol(myCAH, k = 8, main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
somCAH = somCluster(d)
somplotFW = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")

Transkribus with linguistic annotation

POS 3-grams

data = read.csv("data/transkr_student_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data = data[, -1]
colnames(data) = gsub("^X", "", colnames(data))
colnames(data) = gsub(".decolumnized", "", colnames(data))
colnames(data) = gsub("Leg.", "Leg-", colnames(data))
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)

Burrows + vector-length norm

d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
# save data for robustness checks
d = d[select,]
POS3grSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHPOS3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams = cahPlotCol(myCAH, k = 9, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somplotPOS3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")

Lemmas

data = read.csv("data/transkr_student_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data = data[, -1]
colnames(data) = gsub("^X", "", colnames(data))
colnames(data) = gsub(".decolumnized", "", colnames(data))
colnames(data) = gsub("Leg.", "Leg-", colnames(data))
data = data[, toKeep]
data = data[rowSums(data) > 0, ]
data = as.matrix(data)

Burrows + vector-length norm

d = data
# Selection based on Moisl 2011
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
d = d[select,]
LemmasSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHLemmas = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas = cahPlotCol(myCAH, k = 9, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
somCAH = somCluster(d)
somplotLemmas = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")

Function words from lemmas

# Find function words
#rownames(data)[1:250]
functionLemmas = source("functionLemmas.R")$value

Burrows + vector-length norm

d = relativeFreqs(data)
d = d[functionLemmas,]
FLSave = d
d = normalisations(d)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHFL = myCAH
# barplot(sort(myCAH$height))
data = stylo::load.corpus.and.parse(corpus.dir = "../dh-meier-data/output/transkribus-etudiants/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
## slicing input text into tokens...
## 
## turning words into features, e.g. char n-grams (if applicable)...
plotFL = cahPlotCol(myCAH, k = 8, main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
somCAH = somCluster(d)
somplotFL = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words (lemmas)")

Affixes + POS 3-gr + Function words (lemmas)

data = rbind(AffixesSave, POS3grSave, FLSave)
d = normalisations(data)
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHGlob = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob = cahPlotCol(myCAH, k = 9, main = "Affixes + POS 3- grams + Function words (lemmas)")
somCAH = somCluster(d)
somplotGlob = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)")

Plots

Analyses

#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
gridExtra::grid.arrange(plotRaw3grams, plotForms, plotAffixes, plotFW, plotLemmas, plotFL, plotPOS3grams, plotGlob, ncol = 2)

gridExtra::grid.arrange(somplotRaw3grams, somplotForms, somplotAffixes, somplotFW, somplotLemmas, somplotFL, somplotPOS3grams, somplotGlob, ncol = 2)

Robustness

cahList = list(raw3grams = CAHRaw3gr, Forms = CAHForms, Affs = CAHAffs, FW = CAHFW, Lemmas = CAHLemmas, FunctLemm = CAHFL, POS3gr = CAHPOS3gr, Global = CAHGlob)
compareHC(cahList, k = 9)
##           raw3grams     Forms      Affs        FW    Lemmas FunctLemm    POS3gr
## raw3grams 1.0000000 0.8135593 0.8474576 0.8305085 0.7118644 0.6440678 0.6271186
## Forms     0.7966102 1.0000000 0.7966102 0.7457627 0.6440678 0.6440678 0.6101695
## Affs      0.8813559 0.8305085 1.0000000 0.7966102 0.6271186 0.6440678 0.6610169
## FW        0.8474576 0.7627119 0.7457627 1.0000000 0.6101695 0.6779661 0.6949153
## Lemmas    0.6949153 0.6271186 0.6271186 0.6101695 1.0000000 0.5084746 0.6101695
## FunctLemm 0.6610169 0.6440678 0.6271186 0.7118644 0.5423729 1.0000000 0.6610169
## POS3gr    0.5932203 0.6101695 0.5423729 0.6271186 0.6101695 0.5593220 1.0000000
## Global    0.8644068 0.8135593 0.7966102 0.8813559 0.6610169 0.6779661 0.6779661
##              Global
## raw3grams 0.8644068
## Forms     0.7966102
## Affs      0.8305085
## FW        0.8813559
## Lemmas    0.6440678
## FunctLemm 0.6610169
## POS3gr    0.6271186
## Global    1.0000000